Last time I generated the T-cell signature file, I used all HLA genes and sex chromosome genes. The result showed some childrend do not have one paticular cluter. Then, I checked this cluster carefully and found that some ethnic group does not have certain HLA. The T-cell dataset I used was pooled sample analysis. This suggests that we have to eliminate the HLA, mitochondoria, ribozomal and genes on the sex chromosomes.
CD4+/CD45RO+ Memory T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/memory_t
CD4+/CD45RA+/CD25- Naive T cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/naive_t
CD4+/CD25+ Regulatory T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/regulatory_t
CD4+ Helper T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/cd4_t_helper
library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)
setwd("/Volumes/home/msuzuki/scRNA")
memory_t.data <- Read10X("memory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = memory_t.data), value = TRUE)
gene_name <-rownames(memory_t.data)
'%ni%' <- Negate('%in%')
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(memory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(memory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(memory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(memory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(memory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(memory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(memory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("~/scRNA_seq/sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
memory_t.data2<-memory_t.data[rownames(memory_t.data)%in%not_ribo_auto,]
memory_t <- CreateSeuratObject(raw.data = memory_t.data2, min.cells = 3, min.genes = 200, project = "memory_t")
naive_t.data <- Read10X("naive_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naive_t.data), value = TRUE)
gene_name <-rownames(naive_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(naive_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(naive_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(naive_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(naive_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(naive_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(naive_t.data), value = T)
hla.genes <-grep("^HLA", rownames(naive_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naive_t.data2<-naive_t.data[rownames(naive_t.data)%in%not_ribo_auto,]
naive_t <- CreateSeuratObject(raw.data = naive_t.data2, min.cells = 3, min.genes = 200, project = "naive_t")
regulatory_t.data <- Read10X("regulatory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = regulatory_t.data), value = TRUE)
gene_name <-rownames(regulatory_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
regulatory_t.data2<-regulatory_t.data[rownames(regulatory_t.data)%in%not_ribo_auto,]
regulatory_t <- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "regulatory_t")
memory_t@meta.data[, "protocol"] <- "memory_t"
naive_t@meta.data[, "protocol"] <- "naive_t"
regulatory_t@meta.data[, "protocol"] <- "regulatory_t"
mito.genes <- grep("^MT-", x = rownames(x = T_nmr@data), value = TRUE)
percent.mito <- Matrix::colSums(T_nmr@raw.data[mito.genes, ])/Matrix::colSums(T_nmr@raw.data)
T_nmr <- AddMetaData(T_nmr, percent.mito, "percent.mito")
'%ni%' <- Negate('%in%')
gene_name <-rownames(T_nmr@data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(T_nmr@data), value = T)
ribo2.genes <- grep("^RPL", rownames(T_nmr@data), value = T)
ribo.genes <-c(ribo1.genes,ribo2.genes)
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("~/scRNA_seq/sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
T_nm <-MergeSeurat(naive_t,memory_t,add.cell.id1 = "T_n",add.cell.id2 ="T_m")
T_nmr <-MergeSeurat(T_nm,regulatory_t,add.cell.id2 ="T_r")
T_nmr <- FilterCells(object = T_nmr, subset.names = c("nGene"),
low.thresholds = c(200), high.thresholds = c(3000))
T_nmr <- NormalizeData(object = T_nmr)
T_nmr <- ScaleData(object = T_nmr,vars.to.regress=c("nGene","nUMI"))
T_nmr <- FindVariableGenes(object = T_nmr, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
saveRDS(T_nmr,file = "T_nmr2.RDS")
length(rownames(T_nmr@data))
cat("Identified variable genes across the single cells = ", length(x = T_nmr@var.genes))
T_nmr <- RunPCA(object = T_nmr, pc.genes = T_nmr@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 5,pcs.compute = 20)
options(repr.plot.width = 8, repr.plot.height = 4)
PCElbowPlot(T_nmr, num.pc = 20)
PCAPlot(object = T_nmr, dim.1 = 1, dim.2 = 2)
options(repr.plot.width = 8, repr.plot.height = 8)
PCHeatmap(object = T_nmr, pc.use = 1:12, cells.use = 500, do.balanced = TRUE,
label.columns = FALSE, use.full = FALSE)
saveRDS(T_nmr,file = "~/scRNA_seq/T_nmr2.RDS")
Based on the PCA results, I decided to use PC1 to PC10 for TSENE.
T_nmr <- RunTSNE(T_nmr, dims.use = 1:10, do.fast = T)
T_nmr <- FindClusters(object = T_nmr,reduction.type = "pca", dims.use = 1:10, resolution = 0.6, save.SNN = T,force.recalc=T)
PrintFindClustersParams(object = T_nmr)
options(repr.plot.width = 6, repr.plot.height = 6)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
To save the images with higher resolution
library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)
load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")
PrintFindClustersParams(object = T_nmr)
#pdf(file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/TSNE_plots.pdf",width = 6,height = 6)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
#dev.off()
options(repr.plot.width = 8, repr.plot.height = 4)
VlnPlot(object = T_nmr, features.plot = c("CCR7", "CD7"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("TMIGD2", "SELL"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("LEF1", "LGALS1"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("S100A10", "ANXA2"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("DUSP1","CLIC1"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("CCL5", "HOPX"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("GZMA", "GZMK"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("FOS","IL7R"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("TIGIT","IL2RA"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("NKG7","CCR10"),point.size.use = 0.01)
Tcell.markers <- FindAllMarkers(T_nmr, only.pos = TRUE, min.pct = 0.3,test.use="bimod",logfc.threshold = 0.32)
Tcell.markers2 <-subset(Tcell.markers,Tcell.markers$p_val_adj<0.0001)
write.table(Tcell.markers2,file="~/scRNA_seq/Tcell.markers_auto.csv",sep=",")
head(Tcell.markers)
cat("Number of the marker gene = ",length(unique(Tcell.markers2$gene)))
save(T_nmr,file = "~/scRNA_seq/T_nmr2.Robj")
I calculated the median of signature genes in each cell cluster.
normalized<- data.frame(T_nmr@scale.data)
celltypes <-data.frame(T_nmr@ident)
geneLIST<-c(Tcell.markers2$gene)
sig_gene<-normalized[which(row.names(normalized)%in%geneLIST),]
Cluster_0<-sig_gene[,which(celltypes==0)]
Cluster_1<-sig_gene[,which(celltypes==1)]
Cluster_2<-sig_gene[,which(celltypes==2)]
Cluster_3<-sig_gene[,which(celltypes==3)]
Cluster_4<-sig_gene[,which(celltypes==4)]
Cluster_5<-sig_gene[,which(celltypes==5)]
Cluster_6<-sig_gene[,which(celltypes==6)]
#Cluster_7<-sig_gene[,which(celltypes==7)]
#Cluster_8<-sig_gene[,which(celltypes==8)]
#Cluster_9<-sig_gene[,which(celltypes==9)]
#Cluster_10<-sig_gene[,which(celltypes==10)]
#Cluster_11<-sig_gene[,which(celltypes==11)]
#Cluster_12<-sig_gene[,which(celltypes==12)]
#Cluster_13<-sig_gene[,which(celltypes==13)]
#Cluster_14<-sig_gene[,which(celltypes==14)]
j=nrow(sig_gene)
signature_data <-matrix(nrow=j,ncol=7)
for(i in 1:j)
signature_data[i,1]<-2^median(as.double(Cluster_0[i,]))
for(i in 1:j)
signature_data[i,2]<-2^median(as.double(Cluster_1[i,]))
for(i in 1:j)
signature_data[i,3]<-2^median(as.double(Cluster_2[i,]))
for(i in 1:j)
signature_data[i,4]<-2^median(as.double(Cluster_3[i,]))
for(i in 1:j)
signature_data[i,5]<-2^median(as.double(Cluster_4[i,]))
for(i in 1:j)
signature_data[i,6]<-2^median(as.double(Cluster_5[i,]))
for(i in 1:j)
signature_data[i,7]<-2^median(as.double(Cluster_6[i,]))
#for(i in 1:j)
# signature_data[i,8]<-2^median(as.double(Cluster_7[i,]))
#for(i in 1:j)
# signature_data[i,9]<-2^median(as.double(Cluster_8[i,]))
#for(i in 1:j)
# signature_data[i,10]<-2^median(as.double(Cluster_9[i,]))
#for(i in 1:j)
# signature_data[i,11]<-2^median(as.double(Cluster_10[i,]))
#for(i in 1:j)
# signature_data[i,12]<-2^median(as.double(Cluster_11[i,]))
#for(i in 1:j)
# signature_data[i,13]<-2^median(as.double(Cluster_12[i,]))
#for(i in 1:j)
# signature_data[i,14]<-10^median(as.double(Cluster_13[i,]))
#for(i in 1:j)
# signature_data[i,15]<-10^median(as.double(Cluster_14[i,]))
colnames(signature_data)<-c("cluster_0","cluster_1","cluster_2","cluster_3","cluster_4","cluster_5","cluster_6")
#,"cluster_7","cluster_8","cluster_9")
#,"cluster_10","cluster_11","cluster_12")
#,"cluster_13","cluster_14")
row.names(signature_data)<-row.names(sig_gene)
write.table(signature_data,"~/scRNA_seq/singarture_scRNA_Tcell_auto.txt",sep="\t",row.names = T,col.names = T,quote=F)
library(gplots)
library(RColorBrewer)
library(dendextend)
library(ggplot2)
library(biomaRt)
library(reshape2)
library(ggdendro)
library(scales)
Est_CellType <-read.csv(file="~/Documents/Tcell_scRNAseq/CIBERSORT.Output_Job41.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",53),rep("OA",46))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:7])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
group <-c(rep("A",53),rep("OA",46))
pdata <-cbind(Est_CellType[,1:7],group)
dat.m<-melt(pdata,measure.vars =c('cluster_0','cluster_1','cluster_2','cluster_3','cluster_4','cluster_5','cluster_6'), id.vars = 'group')
p<-ggplot(dat.m)+
geom_boxplot(aes(color=group,y=value,x=variable))
p
pdata$sample <- row.names(pdata)
dat.m<-melt(pdata[,c(1:7,9)],id.vars = 'sample')
p <- ggplot(dat.m, aes(sample, value, fill = variable)) +
geom_bar(position = "fill", stat = "identity") +
theme(axis.text=element_text(size=6))
p
classes1=as.factor(pdata$group)
test<-pdata[,1:7]
stats_val <-matrix(ncol=4,nrow=7)
for(i in 1:7){
stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata)[1:7]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")
options(repr.plot.width = 6, repr.plot.height = 6)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
C1_3.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 1, ident.2 = 3)
C4.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 4)
write.table(C1_3.markers,file="~/scRNA_seq/C1_3.markers.csv",sep=",")
write.table(C4.markers,file="~/scRNA_seq/C4.markers.csv",sep=",")
C0.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 0,logfc.threshold = 0.2)
#C0.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 0)
C1.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 1,logfc.threshold = 0.2)
C2.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 2,logfc.threshold = 0.2)
C3.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 3,logfc.threshold = 0.2)
write.table(C0.markers,file="~/scRNA_seq/C0.markers.csv",sep=",")
write.table(C1.markers,file="~/scRNA_seq/C1.markers.csv",sep=",")
write.table(C2.markers,file="~/scRNA_seq/C2.markers.csv",sep=",")
write.table(C3.markers,file="~/scRNA_seq/C3.markers.csv",sep=",")
Deepa and Andrew found issues on some expression data, and they decided to remove those samples from the analysis. I obtained a new file from Deepa to perform the cell subtype proportion analysis.
Exp <-read.table("~/scRNA_seq/Deepa/normcounts_allexcept_AJqc2",sep="\t",header=F)
colnames(Exp)<- c("ENSEMBL","a1", "a2", "a3", "a4", "a5", "a6", "a7", "a9", "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", "a30", "a31", "a32", "a33", "a34", "a36", "a37", "a38", "a39", "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a48", "a49", "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", "a60", "a61", "oa1", "oa2", "oa5", "oa6", "oa7", "oa8", "oa9", "oa10", "oa11", "oa12", "oa13", "oa14", "oa15", "oa16", "oa18", "oa19", "oa20", "oa21", "oa22", "oa23", "oa24", "oa26", "oa27", "oa28", "oa29", "oa30", "oa31", "oa32", "oa33", "oa34", "oa35", "oa36", "oa37", "oa38", "oa39", "oa40", "oa41", "oa42", "oa43", "oa44", "oa45", "oa46", "oa47", "oa48", "oa49", "oa50", "oa51", "oa53")
Exp[1:3,1:4]
I converted EnsID to gene symbol.
library(clusterProfiler)
library("org.Hs.eg.db")
Exp_gene <- bitr(Exp$EnsID, fromType = "ENSEMBL",
toType = c("SYMBOL"),
OrgDb = org.Hs.eg.db)
head(Exp_gene)
exp <-merge(Exp_gene,Exp,by="ENSEMBL")
head(exp)
SYMBOL<-exp$SYMBOL
aggdata_genename <-aggregate(exp[,3:dim(exp)[2]], by=list(SYMBOL),
FUN=mean, na.rm=TRUE) ## calculate mean value per gene (by gene name)
paste("Number of transcript = ",dim(aggdata_genename)[1],sep="")
paste("Number of sample = ",dim(aggdata_genename)[2],sep="")
write.table(aggdata_genename,file="~/scRNA_seq/Deepa/exp_status_byGene.txt",row.names = F,col.names = T,sep="\t",quote = F)
Est_CellType <-read.csv(file="~/scRNA_seq/Deepa/CIBERSORT.Output_Job9.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",57),rep("OA",48))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:7])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
pdata_merge <-cbind(pdata[,c(1,2)],pdata[,3]+pdata[,4],pdata[,5],pdata[,6],pdata[,8])
colnames(pdata_merge)<-c("cluster_0","cluster_1","cluster_2/3","cluster_4","cluster_5","group")
head(pdata_merge)
dat.m<-melt(pdata_merge,measure.vars =c('cluster_0','cluster_1','cluster_2/3','cluster_4','cluster_5'), id.vars = 'group')
p<-ggplot(dat.m)+
geom_boxplot(aes(color=group,y=value,x=variable))
pdf(file="~/scRNA_seq/Deepa/CellTypeProportion.pdf",width = 8, height = 4)
p+theme_bw()
dev.off()
classes1=as.factor(pdata$group)
test<-pdata[,1:5]
stats_val <-matrix(ncol=4,nrow=5)
for(i in 1:5){
stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata_merge)[1:5]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
Est_CellType <-read.csv(file="~/scRNA_seq/Deepa/CIBERSORT.Output_Job10.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",57),rep("OA",48))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:6])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
dat.m<-melt(pdata,measure.vars =c('cluster_0','cluster_1','cluster_2','cluster_3','cluster_4','cluster_5'), id.vars = 'group')
p<-ggplot(dat.m)+
geom_boxplot(aes(color=group,y=value,x=variable))
p
classes1=as.factor(pdata$group)
test<-pdata[,1:6]
stats_val <-matrix(ncol=4,nrow=6)
for(i in 1:6){
stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata)[1:6]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
Deepa requested the signature genes of cluster 5.
PrintFindClustersParams(object = T_nmr)
C5.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 5)
write.table(C5.markers,file="~/scRNA_seq/C5.markers.csv",sep=",")
Comments from Deepa regarding cluter5.
11 of the top 20 genes in Cluster 5 match with NK cells. Of these several overlap with cytotoxic T lymphocytes. So either the source of blood for the 10xGenomics analysis was someone with acute infection or their purity was impacted by an older lot of the T cell separation kit, where they used to not include NK cell markers. Cytotoxic cells are CD8+ so I am assuming that those were removed if the separation was done correctly. Either which way these cells are not CD4+ T cells, so I feel comfortable removing them from the figure.
We decided to add CD8+/CD45RA+ Naive Cytotoxic T Cells into the analysis
The data is from https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/naive_cytotoxic
I got an error message. Error: vector memory exhausted (limit reached?)
So, I analyzed the data upto PCA step.
memory_t.data <- Read10X("memory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = memory_t.data), value = TRUE)
gene_name <-rownames(memory_t.data)
'%ni%' <- Negate('%in%')
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(memory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(memory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(memory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(memory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(memory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(memory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(memory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
memory_t.data2<-memory_t.data[rownames(memory_t.data)%in%not_ribo_auto,]
memory_t <- CreateSeuratObject(raw.data = memory_t.data2, min.cells = 3, min.genes = 200, project = "memory_t")
naive_t.data <- Read10X("naive_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naive_t.data), value = TRUE)
gene_name <-rownames(naive_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(naive_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(naive_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(naive_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(naive_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(naive_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(naive_t.data), value = T)
hla.genes <-grep("^HLA", rownames(naive_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naive_t.data2<-naive_t.data[rownames(naive_t.data)%in%not_ribo_auto,]
naive_t <- CreateSeuratObject(raw.data = naive_t.data2, min.cells = 3, min.genes = 200, project = "naive_t")
regulatory_t.data <- Read10X("regulatory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = regulatory_t.data), value = TRUE)
gene_name <-rownames(regulatory_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
regulatory_t.data2<-regulatory_t.data[rownames(regulatory_t.data)%in%not_ribo_auto,]
regulatory_t <- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "regulatory_t")
naiveCyto_t.data <- Read10X("Naive_cytotoxic/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naiveCyto_t.data), value = TRUE)
gene_name <-rownames(naiveCyto_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naiveCyto_t.data2<-naiveCyto_t.data[rownames(naiveCyto_t.data)%in%not_ribo_auto,]
naiveCyto_t<- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "naiveCyto_t")
memory_t@meta.data[, "protocol"] <- "memory_t"
naive_t@meta.data[, "protocol"] <- "naive_t"
regulatory_t@meta.data[, "protocol"] <- "regulatory_t"
naiveCyto_t@meta.data[, "protocol"] <- "naiveCyto_t"
T_nm <-MergeSeurat(naive_t,memory_t,add.cell.id1 = "T_n",add.cell.id2 ="T_m")
T_nmr <-MergeSeurat(T_nm,regulatory_t,add.cell.id2 ="T_r")
T_nmrc <-MergeSeurat(T_nmr,naiveCyto_t,add.cell.id2 ="T_c")
T_nmrc <- FilterCells(object = T_nmrc, subset.names = c("nGene"),
low.thresholds = c(200), high.thresholds = c(3000))
T_nmrc <- NormalizeData(object = T_nmrc)
T_nmrc <- ScaleData(object = T_nmrc,vars.to.regress=c("nGene","nUMI"))
pdf("FindVariableGenes.pdf")
T_nmrc <- FindVariableGenes(object = T_nmrc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
dev.off()
saveRDS(T_nmrc,file = "T_nmrc.RDS")
cat("Identified variable genes across the single cells = ", length(x = T_nmrc@var.genes))
T_nmrc <- RunPCA(object = T_nmrc, pc.genes = T_nmrc@var.genes, do.print = FALSE,
pcs.compute = 30)
saveRDS(T_nmrc,file = "T_nmrc2.RDS")
load("/Volumes/home/msuzuki/scRNA/T_nmrc2.Robj")
options(repr.plot.width = 8, repr.plot.height = 4)
PCElbowPlot(T_nmrc, num.pc = 20)
PCAPlot(object = T_nmrc, dim.1 = 1, dim.2 = 2)
options(repr.plot.width = 8, repr.plot.height = 8)
PCHeatmap(object = T_nmrc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE,
label.columns = FALSE, use.full = FALSE)
T_nmrc <- RunTSNE(T_nmrc, dims.use = 1:10, do.fast = T, check_duplicates = FALSE)
T_nmrc <- FindClusters(object = T_nmrc,reduction.type = "pca", dims.use = 1:10, resolution = 0.6, save.SNN = T,force.recalc=T)
PrintFindClustersParams(object = T_nmrc)
Deepa asked the gene signature file of cluter 4 and 6.
I loaded the robj "T_nmr2.Robj" to the R.
load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")
library("Seurat")
PrintFindClustersParams(object = T_nmr)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
C4.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 4)
C6.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 6)
write.table(C4.markers,file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/C4.markers.csv",sep=",")
write.table(C6.markers,file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/C6.markers.csv",sep=",")
Deepa identified 157 candidate genes and of those 20 genes are significantly assoicaated with lung function. Therefore, I performed the analysis to see how many randomly selected 157 genes have significant association to lung function.
data <-read.csv("~/Dropbox (EinsteinMed)/Deepa//normcounts_excpt_AJqc_final_cha36b30rma22a26.csv",sep=",",header=T)
head(data)
cat("This dataset contains 2 lung function values and ",dim(data)[1]-2, "gene expression profile from ", dim(data)[2]-1," individuals")
cat("Removing zero expressed genes for all samples")
clean <-data[which(rowSums(data[,2:dim(data)[2]])!=0),]
row.names(clean)<-clean[,1]
clean <- clean[,-1]
cat("After removing those genes, we have ",dim(clean)[1]-2, "gene expression profile from ", dim(clean)[2]," individuals")
cat("Calculate the associations between lung function and gene expression")
cor_result <-matrix(nrow=dim(clean)[1]-2,ncol=2)
row.names(cor_result)<-row.names(clean)[3:dim(clean)[1]]
for(i in 3:dim(clean)[1])
{
erv <-as.numeric(clean[1,])
fev <-as.numeric(clean[2,])
test <-as.numeric(clean[i,])
cor_result[i-2,1] <-unlist(cor.test(erv,test)[3])
cor_result[i-2,2] <-unlist(cor.test(fev,test)[3])
}
summary(cor_result)
head(cor_result)
per_result <-matrix(nrow=1000,ncol=3)
for(i in 1:1000){
rnd_num <-sample(dim(cor_result)[1],157)
per_result[i,1] <-length(which(cor_result[rnd_num,1] < 0.05))
per_result[i,2] <-length(which(cor_result[rnd_num,2] < 0.05))
per_result[i,3] <-length(which((cor_result[rnd_num,2] < 0.05)&(cor_result[rnd_num,1] < 0.05)))
}
options(repr.plot.width = 4, repr.plot.height = 4)
hist(per_result[,1],main="ERV",breaks =25,xlim =c(0,60),xlab = "")
abline(v=39,col=2)
cat("p-value= ",length(which(per_result[,1]>(39)))+1/(1000+1))
hist(per_result[,2],main="FEV/FVC",breaks =25,xlim =c(0,60),xlab = "")
abline(v=55,col=2)
cat("p-value= ",length(which(per_result[,2]>(24+31)))+1/(1000+1))
hist(per_result[,3],main="ERV&FEVV/FVC",breaks =15,xlim =c(0,60),xlab = "")
abline(v=24,col=2)
cat("p-value= ",length(which(per_result[,3]>(24)))+1/(1000+1))